{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from requests_html import HTMLSession\n",
    "import requests_html\n",
    "import pandas as pd\n",
    "import urllib.parse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A1  nfu.edu.cn \n",
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/tzgg/index.htm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存\n",
    "with open (\"html_out/_nfu_通知公告.html\", encoding = \"utf8\", mode = \"w\") as fp:\n",
    "    fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读\n",
    "with open (\"html_out/_nfu_通知公告.html\", encoding = \"utf8\", mode = \"r\") as fp:\n",
    "    html_load = fp.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'https://www.nfu.edu.cn/tzgg/f7cc3ba20c4949b1b56ef7d00b0b967b.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/8ac78f2552af46e0b93ba2e18bdea7da.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/f00b6f6c563a4ec78d257c3fd1e17cf5.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/78963fcbb3a04775a54e231a1c0dd8aa.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/7c079ea83e704a1a835aa83cbe8e2516.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/d1ed71a3a8d34c109520a60fe293e268.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/2141270bfd3d4f1a8d6fd6d319fb5568.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/dc0aa0e0286a465e9dd0192970e8b8b2.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/24740a36aa804ebda83d7eee1ac68d43.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/ac41c5eaf4f54f0e8d23b583efa8e430.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/fcf752d5e4be4855a902e3f254fc0c60.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/fe3885d384cb4abf98588c829cf5e947.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/8eace2de9c154c8fbd0abe69ba73de7d.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/e6e4280246314defa66889205d093edd.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/edf05fc9bd5c4288aa5c9f92260ffc71.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/dc68659b0b154db9b32a6f8a24039a9f.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/2877458fc29644caad355cf84c82f281.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/366d2f7b7b1544b19824da4e2f2a89d8.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/71e34b9fd75b4db697be9ae1c4d4b9fa.htm'},\n",
       " {'https://www.nfu.edu.cn/tzgg/c62c650000ed42ca90c45aceca1baef8.htm'}]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_URL=[i.absolute_links for i in r.html.xpath('//div[@class=\"news_title\"]/a')]\n",
    "list_URL\n",
    "# 绝对路径，可直接得出链接，不需要再去拼接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # 重组链接\n",
    "# list_URL  = [urllib.parse.urlunparse\\\n",
    "# ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "# for detail_url in parsed.xpath('//div[@class=\"news_title\"]/a/@href')]\n",
    "# list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>标题</th>\n",
       "      <th>链结</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>我校学子参加第十一届蓝桥杯大赛个人赛（第二场）省赛获奖</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/f7cc3ba20c4949b1b...</td>\n",
       "      <td>2020-11-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>“读大学，究竟读什么” ——护理与健康学院2020级新生成长导师见面会顺利举行</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/8ac78f2552af46e0b...</td>\n",
       "      <td>2020-11-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>大英中心2020年“晨读月”语音特训营圆满落幕</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/f00b6f6c563a4ec78...</td>\n",
       "      <td>2020-11-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>护理与健康学院直属党支部学生党支部学习“习近平总书记在纪念中国人民志愿军抗美援朝出国作战70...</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/78963fcbb3a04775a...</td>\n",
       "      <td>2020-11-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>我校18、19级英才班课程及教学情况学生座谈会顺利展开</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/7c079ea83e704a1a8...</td>\n",
       "      <td>2020-11-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>我校英才班经验分享会顺利举行</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/d1ed71a3a8d34c109...</td>\n",
       "      <td>2020-11-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>护理与健康学院2020年综合测评修订改革会议顺利开展</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/2141270bfd3d4f1a8...</td>\n",
       "      <td>2020-11-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>《日本动漫史——专业选修课》开课啦！</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/dc0aa0e0286a465e9...</td>\n",
       "      <td>2020-10-30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>我校学子在全国高校思想政治理论课2020年寒假实践活动中荣获佳绩</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/24740a36aa804ebda...</td>\n",
       "      <td>2020-10-30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>护理与健康学院第五届护理技能竞赛圆满结束</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/ac41c5eaf4f54f0e8...</td>\n",
       "      <td>2020-10-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>我校护理与健康学院名家讲座第三讲圆满结束</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/fcf752d5e4be4855a...</td>\n",
       "      <td>2020-10-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>我校云康医学与健康管理学院病理技术方向教学计划及课程设置专家研讨会顺利召开</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/fe3885d384cb4abf9...</td>\n",
       "      <td>2020-10-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>会计学院党总支2020年秋季发展对象培训课顺利完成</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/8eace2de9c154c8fb...</td>\n",
       "      <td>2020-10-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>岭南集团导师林津翘莅临我院召开审计人员职业生涯规划讲座</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/e6e4280246314defa...</td>\n",
       "      <td>2020-10-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>我校电气与计算机工程学院学子在中国大学生计算机设计大赛再创佳绩！</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/edf05fc9bd5c4288a...</td>\n",
       "      <td>2020-10-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>护理与健康学院第三十三期师生面对面圆满结束</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/dc68659b0b154db9b...</td>\n",
       "      <td>2020-10-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>护理与健康学院2020年青年教师培训会议顺利开展</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/2877458fc29644caa...</td>\n",
       "      <td>2020-10-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>大英中心“晨读月”2020级实习领读员动员大会圆满结束</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/366d2f7b7b1544b19...</td>\n",
       "      <td>2020-10-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>护理与健康学院直属党支部学生党支部召开2020年秋季通表大会</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/71e34b9fd75b4db69...</td>\n",
       "      <td>2020-10-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>普华永道You Plus特训计划项目宣讲会顺利举行</td>\n",
       "      <td>{https://www.nfu.edu.cn/tzgg/c62c650000ed42ca9...</td>\n",
       "      <td>2020-10-27</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   标题  \\\n",
       "0                         我校学子参加第十一届蓝桥杯大赛个人赛（第二场）省赛获奖   \n",
       "1             “读大学，究竟读什么” ——护理与健康学院2020级新生成长导师见面会顺利举行   \n",
       "2                             大英中心2020年“晨读月”语音特训营圆满落幕   \n",
       "3   护理与健康学院直属党支部学生党支部学习“习近平总书记在纪念中国人民志愿军抗美援朝出国作战70...   \n",
       "4                         我校18、19级英才班课程及教学情况学生座谈会顺利展开   \n",
       "5                                      我校英才班经验分享会顺利举行   \n",
       "6                          护理与健康学院2020年综合测评修订改革会议顺利开展   \n",
       "7                                  《日本动漫史——专业选修课》开课啦！   \n",
       "8                    我校学子在全国高校思想政治理论课2020年寒假实践活动中荣获佳绩   \n",
       "9                                护理与健康学院第五届护理技能竞赛圆满结束   \n",
       "10                               我校护理与健康学院名家讲座第三讲圆满结束   \n",
       "11              我校云康医学与健康管理学院病理技术方向教学计划及课程设置专家研讨会顺利召开   \n",
       "12                          会计学院党总支2020年秋季发展对象培训课顺利完成   \n",
       "13                        岭南集团导师林津翘莅临我院召开审计人员职业生涯规划讲座   \n",
       "14                   我校电气与计算机工程学院学子在中国大学生计算机设计大赛再创佳绩！   \n",
       "15                              护理与健康学院第三十三期师生面对面圆满结束   \n",
       "16                           护理与健康学院2020年青年教师培训会议顺利开展   \n",
       "17                        大英中心“晨读月”2020级实习领读员动员大会圆满结束   \n",
       "18                     护理与健康学院直属党支部学生党支部召开2020年秋季通表大会   \n",
       "19                          普华永道You Plus特训计划项目宣讲会顺利举行   \n",
       "\n",
       "                                                   链结          日期  \n",
       "0   {https://www.nfu.edu.cn/tzgg/f7cc3ba20c4949b1b...  2020-11-03  \n",
       "1   {https://www.nfu.edu.cn/tzgg/8ac78f2552af46e0b...  2020-11-03  \n",
       "2   {https://www.nfu.edu.cn/tzgg/f00b6f6c563a4ec78...  2020-11-02  \n",
       "3   {https://www.nfu.edu.cn/tzgg/78963fcbb3a04775a...  2020-11-02  \n",
       "4   {https://www.nfu.edu.cn/tzgg/7c079ea83e704a1a8...  2020-11-02  \n",
       "5   {https://www.nfu.edu.cn/tzgg/d1ed71a3a8d34c109...  2020-11-02  \n",
       "6   {https://www.nfu.edu.cn/tzgg/2141270bfd3d4f1a8...  2020-11-02  \n",
       "7   {https://www.nfu.edu.cn/tzgg/dc0aa0e0286a465e9...  2020-10-30  \n",
       "8   {https://www.nfu.edu.cn/tzgg/24740a36aa804ebda...  2020-10-30  \n",
       "9   {https://www.nfu.edu.cn/tzgg/ac41c5eaf4f54f0e8...  2020-10-29  \n",
       "10  {https://www.nfu.edu.cn/tzgg/fcf752d5e4be4855a...  2020-10-29  \n",
       "11  {https://www.nfu.edu.cn/tzgg/fe3885d384cb4abf9...  2020-10-29  \n",
       "12  {https://www.nfu.edu.cn/tzgg/8eace2de9c154c8fb...  2020-10-29  \n",
       "13  {https://www.nfu.edu.cn/tzgg/e6e4280246314defa...  2020-10-29  \n",
       "14  {https://www.nfu.edu.cn/tzgg/edf05fc9bd5c4288a...  2020-10-29  \n",
       "15  {https://www.nfu.edu.cn/tzgg/dc68659b0b154db9b...  2020-10-28  \n",
       "16  {https://www.nfu.edu.cn/tzgg/2877458fc29644caa...  2020-10-28  \n",
       "17  {https://www.nfu.edu.cn/tzgg/366d2f7b7b1544b19...  2020-10-28  \n",
       "18  {https://www.nfu.edu.cn/tzgg/71e34b9fd75b4db69...  2020-10-28  \n",
       "19  {https://www.nfu.edu.cn/tzgg/c62c650000ed42ca9...  2020-10-27  "
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 输出结果\n",
    "# B-D-1 pd.DataFrame 建构，pandas课有教\n",
    "df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath('//div[@class=\"news_title\"]/a/@title'),\n",
    "         \"链结\": list_URL,\n",
    "         \"日期\": parsed.xpath('//font[@class=\"right-more\"]/text()'),\n",
    "     } )\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.nfu.edu.cn/tzgg/index19.htm'"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 第一页\n",
    "base_url_01 = r.url\n",
    "base_url_01"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SplitResult(scheme='https', netloc='www.nfu.edu.cn', path='/tzgg/index19.htm', query='', fragment='')"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlsplit(base_url_01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>第一页</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/tzgg/index19.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 第一页\n",
       "0              https\n",
       "1     www.nfu.edu.cn\n",
       "2  /tzgg/index19.htm\n",
       "3                   \n",
       "4                   "
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(urllib.parse.urlsplit(base_url_01)).rename({0:\"第一页\"},axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.nfu.edu.cn/tzgg/index2.htm'"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 第二页\n",
    "base_url_02 = session.get('https://www.nfu.edu.cn/tzgg/index2.htm').url\n",
    "base_url_02"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>第一页</th>\n",
       "      <th>第二页</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https</td>\n",
       "      <td>https</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/tzgg/index19.htm</td>\n",
       "      <td>/tzgg/index2.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 第一页               第二页\n",
       "0              https             https\n",
       "1     www.nfu.edu.cn    www.nfu.edu.cn\n",
       "2  /tzgg/index19.htm  /tzgg/index2.htm\n",
       "3                                     \n",
       "4                                     "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['第二页'] = urllib.parse.urlsplit(base_url_02)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "35\n"
     ]
    }
   ],
   "source": [
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn/tzgg/index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        break\n",
    "# so page = 19?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/tzgg/index1.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index2.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index3.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index4.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index5.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index6.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index7.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index8.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index9.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index10.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index11.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index12.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index13.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index14.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index15.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index16.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index17.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index18.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index19.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index20.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index21.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index22.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index23.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index24.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index25.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index26.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index27.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index28.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index29.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index30.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index31.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index32.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index33.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index34.htm']"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group = ['https://www.nfu.edu.cn/tzgg/index'+str(i)+'.htm' for i in range(1,35)]\n",
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/tzgg/index.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index1.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index2.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index3.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index4.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index5.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index6.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index7.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index8.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index9.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index10.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index11.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index12.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index13.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index14.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index15.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index16.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index17.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index18.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index19.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index20.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index21.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index22.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index23.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index24.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index25.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index26.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index27.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index28.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index29.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index30.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index31.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index32.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index33.htm',\n",
       " 'https://www.nfu.edu.cn/tzgg/index34.htm']"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group.insert(0,'https://www.nfu.edu.cn/tzgg/index.htm')\n",
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/tzgg/index.htm'"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlparse(url_group[0]).path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "for url in url_group:\n",
    "    r = session.get(url)\n",
    "#     print(r.html.html)\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    with open ('html_out/'+path, encoding = \"utf8\", mode = \"w\") as fp:\n",
    "        fp.write(r.html.html)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# xpath 准备：\n",
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pages_content_url(parsed):\n",
    "    list_URL  = [urllib.parse.urlunparse\\\n",
    "                 ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "                 for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "    return list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index.htm', 'index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index22.htm', 'index23.htm', 'index24.htm', 'index25.htm', 'index26.htm', 'index27.htm', 'index28.htm', 'index29.htm', 'index3.htm', 'index30.htm', 'index31.htm', 'index32.htm', 'index33.htm', 'index34.htm', 'index4.htm', 'index5.htm', 'index6.htm', 'index7.htm', 'index8.htm', 'index9.htm']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>标题</th>\n",
       "      <th>链结</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>校园管理部关于2021年元旦放假校园生活服务安排的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/cd60e06378e5449294...</td>\n",
       "      <td>2020-12-25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>中山大学南方学院关于2021年元旦放假安排的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/16fcbd56eab04220b3...</td>\n",
       "      <td>2020-12-17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>关于开展2020年知识产权竞赛的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/155655d4a7e74c7695...</td>\n",
       "      <td>2020-12-16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>中山大学南方学院关于举办2020年预防艾滋病巡讲活动的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/f381db0e5b3e4746b3...</td>\n",
       "      <td>2020-12-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>关于开展2020年安全知识竞赛的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/ae83ecc6ce894bcb81...</td>\n",
       "      <td>2020-12-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>562</th>\n",
       "      <td>2</td>\n",
       "      <td>“南苑青年”系列讲座之第十三讲的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/bbd14d55a99247a79f...</td>\n",
       "      <td>2015-04-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>563</th>\n",
       "      <td>3</td>\n",
       "      <td>中山大学南方学院关于举办“南方湖畔·艺彩纷呈”第七届校园文化艺术节活动通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/a8e5e752e409486da2...</td>\n",
       "      <td>2015-04-07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>564</th>\n",
       "      <td>4</td>\n",
       "      <td>学院办公室关于2015年五一放假安排的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/e3f763049ee54cfc8c...</td>\n",
       "      <td>2015-04-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>565</th>\n",
       "      <td>5</td>\n",
       "      <td>中山大学南方学院关于2015年公共机构节能宣传作品征集活动的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/f3ae1aa3ccdb4d87bc...</td>\n",
       "      <td>2015-04-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>566</th>\n",
       "      <td>6</td>\n",
       "      <td>关于开展校园网络和运营商移动网络使用情况调查的通知</td>\n",
       "      <td>https://www.nfu.edu.cn/tzgg/6de44f6a618540ef82...</td>\n",
       "      <td>1970-01-01</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>687 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     index                                     标题  \\\n",
       "0        0            校园管理部关于2021年元旦放假校园生活服务安排的通知   \n",
       "1        1               中山大学南方学院关于2021年元旦放假安排的通知   \n",
       "2        2                     关于开展2020年知识产权竞赛的通知   \n",
       "3        3          中山大学南方学院关于举办2020年预防艾滋病巡讲活动的通知   \n",
       "4        4                     关于开展2020年安全知识竞赛的通知   \n",
       "..     ...                                    ...   \n",
       "562      2                     “南苑青年”系列讲座之第十三讲的通知   \n",
       "563      3  中山大学南方学院关于举办“南方湖畔·艺彩纷呈”第七届校园文化艺术节活动通知   \n",
       "564      4                  学院办公室关于2015年五一放假安排的通知   \n",
       "565      5       中山大学南方学院关于2015年公共机构节能宣传作品征集活动的通知   \n",
       "566      6              关于开展校园网络和运营商移动网络使用情况调查的通知   \n",
       "\n",
       "                                                    链结          日期  \n",
       "0    https://www.nfu.edu.cn/tzgg/cd60e06378e5449294...  2020-12-25  \n",
       "1    https://www.nfu.edu.cn/tzgg/16fcbd56eab04220b3...  2020-12-17  \n",
       "2    https://www.nfu.edu.cn/tzgg/155655d4a7e74c7695...  2020-12-16  \n",
       "3    https://www.nfu.edu.cn/tzgg/f381db0e5b3e4746b3...  2020-12-03  \n",
       "4    https://www.nfu.edu.cn/tzgg/ae83ecc6ce894bcb81...  2020-12-03  \n",
       "..                                                 ...         ...  \n",
       "562  https://www.nfu.edu.cn/tzgg/bbd14d55a99247a79f...  2015-04-08  \n",
       "563  https://www.nfu.edu.cn/tzgg/a8e5e752e409486da2...  2015-04-07  \n",
       "564  https://www.nfu.edu.cn/tzgg/e3f763049ee54cfc8c...  2015-04-01  \n",
       "565  https://www.nfu.edu.cn/tzgg/f3ae1aa3ccdb4d87bc...  2015-04-01  \n",
       "566  https://www.nfu.edu.cn/tzgg/6de44f6a618540ef82...  1970-01-01  \n",
       "\n",
       "[687 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "list_df = []\n",
    "\n",
    "\n",
    "files= os.listdir('html_out/tzgg/')\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/tzgg/'+html,encoding='utf8',mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "        parsed = requests_html.soup_parse(html_load)\n",
    "        list_URL = pages_content_url(parsed)\n",
    "        \n",
    "        df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "         \"链结\": list_URL,\n",
    "         \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "        } )\n",
    "        list_df.append(df)\n",
    "\n",
    "        \n",
    "        \n",
    "df_all = pd.concat(list_df).reset_index().sort_values(by='日期',ascending=False)\n",
    "display(df_all)    \n",
    "\n",
    "with pd.ExcelWriter('data_out/通知公告.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_all.to_excel(writer, sheet_name='媒体报道')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
